#!/usr/bin/env python3
"""
HDGL Resilience & Recovery System
Handles chain failures with grace and mathematical elegance

Design Philosophy:
- Solana is SOURCE OF TRUTH (canonical state)
- Ethereum is COMMITMENT REGISTRY (proof of history)
- Bitcoin references commitments (doesn't fail, just waits)
- VM can run offline and sync later

Failure Modes:
1. Solana down → Use local cache, resume when back
2. Ethereum down → Queue commitments, batch submit later
3. Both down → Pure local evolution, full resync when recovered
4. Partial network → Degrade gracefully, prioritize critical operations
"""

import json
import time
import pickle
from decimal import Decimal
from typing import Optional, Dict, List, Tuple
from dataclasses import dataclass, asdict
from pathlib import Path
import logging

from hdgl_vm_unified import HDGLVM, HDGLState, HDGLField

# Configure the root logger at import time (INFO and above). basicConfig() is a
# no-op when the root logger already has handlers, so an application that set up
# logging before importing this module keeps its own configuration.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by every class in this file.
logger = logging.getLogger(__name__)

# ============================================
# Resilient State Management
# ============================================

@dataclass
class StateCheckpoint:
    """Snapshot of an HDGL state plus the metadata needed to vet it later.

    Fields:
        state: the captured HDGLState.
        commitment: hash the state is expected to reproduce.
        timestamp: capture time as supplied by the creator.
        evolution_count: evolution counter recorded at capture time.
        source: free-form origin label of the snapshot.
    """
    state: HDGLState
    commitment: bytes
    timestamp: float
    evolution_count: int
    source: str  # origin tag, e.g. 'solana', 'local', 'recovered'

    def is_valid(self) -> bool:
        """Return True when the snapshot passes all three integrity checks.

        The checks run in order and stop at the first failure:
        1. every dimension lies within [0, HDGLField.SATURATE_SLOT];
        2. the state's commitment hash equals the stored commitment;
        3. D7 is within 10% of the value predicted from D1 and D4.
        """
        # 1. Range check on every dimension.
        out_of_range = any(
            value < Decimal(0) or value > HDGLField.SATURATE_SLOT
            for value in self.state.dimensions
        )
        if out_of_range:
            return False

        # 2. The stored commitment must be reproducible from the state.
        if self.state.commitment_hash() != self.commitment:
            return False

        # 3. φ-coherence: predict D7 from D1 and D4.
        weighted_d1 = self.state.dimensions[0] * HDGLField.PHI
        weighted_d4 = self.state.dimensions[3] * HDGLField.INV_PHI
        predicted_d7 = (weighted_d1 + weighted_d4) / HDGLField.PHI_SQ
        observed_d7 = self.state.dimensions[6]

        # Relative drift; the +1 in the denominator keeps it finite when the
        # prediction is zero. Anything below 10% is accepted (analog tolerance).
        drift = abs(observed_d7 - predicted_d7) / (predicted_d7 + Decimal(1))
        return drift < Decimal("0.1")

class StateCache:
    """Local on-disk cache: the latest checkpoint plus a bounded history log.

    Files kept inside ``cache_dir``:
        latest_checkpoint.pkl     pickled most recent checkpoint
        checkpoint_history.jsonl  one JSON summary line per saved checkpoint

    All methods are best-effort: I/O problems are logged, never raised.
    """

    def __init__(self, cache_dir: str = ".hdgl_cache"):
        """Create the cache directory (including missing parents) if needed.

        Args:
            cache_dir: directory that will hold the checkpoint and history files.
        """
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested cache location does not fail on first use.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.checkpoint_file = self.cache_dir / "latest_checkpoint.pkl"
        self.history_file = self.cache_dir / "checkpoint_history.jsonl"
        self.max_history = 1000

    def save_checkpoint(self, checkpoint: "StateCheckpoint"):
        """Persist ``checkpoint`` as the latest one and append it to the history.

        The pickle is written to a temporary file and then renamed over the
        previous checkpoint, so an interrupted write never leaves a truncated
        ``latest_checkpoint.pkl`` behind. Failures are logged, not raised.
        """
        tmp_file = self.checkpoint_file.with_name(self.checkpoint_file.name + ".tmp")
        try:
            # Latest checkpoint (binary), swapped in atomically.
            with open(tmp_file, 'wb') as f:
                pickle.dump(checkpoint, f)
            tmp_file.replace(self.checkpoint_file)

            # Human-readable summary line, for debugging.
            history_entry = {
                'timestamp': checkpoint.timestamp,
                'evolution_count': checkpoint.evolution_count,
                'commitment': checkpoint.commitment.hex(),
                'source': checkpoint.source,
                'dimensions': [float(d) for d in checkpoint.state.dimensions]
            }

            with open(self.history_file, 'a', encoding='utf-8') as f:
                f.write(json.dumps(history_entry) + '\n')

            # Keep the history file bounded.
            self._trim_history()

        except Exception as e:
            logger.error(f"Failed to save checkpoint: {e}")
            # Best-effort cleanup of a half-written temp file; it usually does
            # not exist (already renamed or never created), which is fine.
            try:
                tmp_file.unlink()
            except OSError:
                pass

    def load_checkpoint(self) -> Optional["StateCheckpoint"]:
        """Return the latest checkpoint if it exists and validates, else None.

        NOTE(security): ``pickle.load`` can execute arbitrary code embedded in
        the file. The cache directory must only be writable by trusted users.
        """
        try:
            if not self.checkpoint_file.exists():
                return None

            with open(self.checkpoint_file, 'rb') as f:
                checkpoint = pickle.load(f)

            # Reject checkpoints that fail their own integrity checks.
            if checkpoint.is_valid():
                logger.info(f"Loaded checkpoint from {checkpoint.source} "
                          f"(evolution #{checkpoint.evolution_count})")
                return checkpoint
            else:
                logger.warning("Checkpoint validation failed, discarding")
                return None

        except Exception as e:
            logger.error(f"Failed to load checkpoint: {e}")
            return None

    def get_history(self, count: int = 10) -> List[Dict]:
        """Return up to ``count`` most recent history entries, oldest first.

        A non-positive ``count`` yields an empty list. Blank lines are ignored
        and malformed lines are skipped with a warning, so a single bad line
        no longer hides the rest of the history.
        """
        if count <= 0:
            # Guard needed because lines[-0:] would select the whole list.
            return []

        try:
            if not self.history_file.exists():
                return []

            with open(self.history_file, 'r', encoding='utf-8') as f:
                lines = [line for line in f if line.strip()]

            entries = []
            for line in lines[-count:]:
                try:
                    entries.append(json.loads(line))
                except json.JSONDecodeError as e:
                    logger.warning(f"Skipping malformed history line: {e}")
            return entries

        except Exception as e:
            logger.error(f"Failed to read history: {e}")
            return []

    def _trim_history(self):
        """Rewrite the history file so it holds at most ``max_history`` lines."""
        try:
            if not self.history_file.exists():
                return

            with open(self.history_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            if len(lines) > self.max_history:
                # Keep only the newest max_history entries.
                with open(self.history_file, 'w', encoding='utf-8') as f:
                    f.writelines(lines[-self.max_history:])

        except Exception as e:
            logger.error(f"Failed to trim history: {e}")

# ============================================
# Commitment Queue (for Ethereum failures)
# ============================================

class CommitmentQueue:
    """Disk-backed FIFO of commitments awaiting submission to Ethereum.

    The queue is stored as a JSON list of
    ``{'commitment': <hex>, 'evolution_count': <int>, 'timestamp': <float>}``
    entries and is re-read from disk on every operation.

    NOTE: ``dequeue_batch`` removes entries before the caller has submitted
    them; callers must re-enqueue anything they fail to submit.
    """

    def __init__(self, queue_file: str = ".hdgl_cache/commitment_queue.json"):
        """Remember the queue location and make sure its directory exists.

        Args:
            queue_file: path of the JSON file backing the queue.
        """
        self.queue_file = Path(queue_file)
        # parents=True so a nested queue location does not fail on first use.
        self.queue_file.parent.mkdir(parents=True, exist_ok=True)
        self.max_queue_size = 100

    def enqueue(self, commitment: bytes, evolution_count: int):
        """Append a commitment, dropping the oldest entries if the queue is full.

        Args:
            commitment: raw commitment bytes (stored hex-encoded).
            evolution_count: evolution counter the commitment belongs to.
        """
        queue = self._load_queue()

        queue.append({
            'commitment': commitment.hex(),
            'evolution_count': evolution_count,
            'timestamp': time.time()
        })

        # Keep the queue bounded. Dropping is lossy, so make it visible.
        overflow = len(queue) - self.max_queue_size
        if overflow > 0:
            logger.warning(f"Commitment queue full, dropping {overflow} oldest entries")
            # Slice from the front: unlike queue[-max:], this is also correct
            # when max_queue_size is 0.
            queue = queue[overflow:]

        self._save_queue(queue)
        logger.info(f"Queued commitment (queue size: {len(queue)})")

    def dequeue_batch(self, max_count: int = 10) -> List[Tuple[bytes, int]]:
        """Remove and return up to ``max_count`` oldest entries.

        Returns:
            List of ``(commitment_bytes, evolution_count)`` tuples, oldest
            first. Empty when the queue is empty or ``max_count`` is not
            positive. Malformed entries are discarded with a warning.
        """
        if max_count <= 0:
            return []

        queue = self._load_queue()

        if not queue:
            return []

        head = queue[:max_count]
        remaining = queue[max_count:]

        # Decode before touching the disk so one bad entry cannot raise after
        # the good ones have already been removed from the file.
        batch = []
        for item in head:
            try:
                batch.append((bytes.fromhex(item['commitment']), item['evolution_count']))
            except (KeyError, TypeError, ValueError) as e:
                logger.warning(f"Discarding malformed queue entry: {e}")

        self._save_queue(remaining)
        return batch

    def size(self) -> int:
        """Return the number of entries currently stored on disk."""
        return len(self._load_queue())

    def _load_queue(self) -> List[Dict]:
        """Read the queue from disk; return [] if it is missing or unreadable."""
        if not self.queue_file.exists():
            return []

        try:
            with open(self.queue_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning(f"Failed to load commitment queue: {e}")
            return []

        if not isinstance(data, list):
            logger.warning("Commitment queue file does not contain a list, ignoring it")
            return []
        return data

    def _save_queue(self, queue: List[Dict]):
        """Write the queue to disk via a temp file and an atomic rename."""
        tmp_file = self.queue_file.with_name(self.queue_file.name + ".tmp")
        try:
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(queue, f)
            tmp_file.replace(self.queue_file)
        except Exception as e:
            logger.error(f"Failed to save queue: {e}")
            # Best-effort cleanup; the temp file may not exist at all.
            try:
                tmp_file.unlink()
            except OSError:
                pass

# ============================================
# Chain Health Monitoring
# ============================================

class ChainHealth:
    """Track per-chain health and decide when a failing chain may be retried.

    ``status`` maps a chain name ('solana', 'ethereum') to a dict with:
        healthy       False after 3 consecutive failures, True after a success
        last_success  time.time() of the last success (construction time initially)
        last_failure  time.time() of the last failure, or None if none yet
        failures      number of consecutive failures
    """

    def __init__(self):
        now = time.time()
        self.status = {
            chain: {
                'healthy': True,
                'last_success': now,
                'last_failure': None,
                'failures': 0,
            }
            for chain in ('solana', 'ethereum')
        }
        self.backoff_base = 5  # seconds
        self.max_backoff = 300  # 5 minutes

    def record_success(self, chain: str):
        """Mark ``chain`` healthy and clear its consecutive-failure counter."""
        info = self.status[chain]
        info['healthy'] = True
        info['last_success'] = time.time()
        info['failures'] = 0

    def record_failure(self, chain: str):
        """Count a failure; the chain turns unhealthy at 3 in a row."""
        info = self.status[chain]
        info['failures'] += 1
        # The backoff window is measured from this moment.
        info['last_failure'] = time.time()

        if info['failures'] >= 3:
            info['healthy'] = False

    def is_healthy(self, chain: str) -> bool:
        """Return the current healthy flag of ``chain``."""
        return self.status[chain]['healthy']

    def get_backoff(self, chain: str) -> float:
        """Return the wait in seconds: base * 2**failures, capped at max_backoff."""
        failures = self.status[chain]['failures']
        return min(self.backoff_base * (2 ** failures), self.max_backoff)

    def should_retry(self, chain: str) -> bool:
        """Return True once the backoff window since the last failure has passed.

        The window is measured from the most recent *failure*. Measuring from
        the last success (as before) makes this return True on every call once
        an outage lasts longer than ``max_backoff``, which disables the backoff
        exactly when it is needed. With no recorded failure there is nothing
        to wait for, so the answer is True.
        """
        info = self.status[chain]
        # .get(): tolerate status dicts created without the 'last_failure' key.
        last_failure = info.get('last_failure')
        if info['failures'] == 0 or last_failure is None:
            return True
        return (time.time() - last_failure) >= self.get_backoff(chain)

    def get_status(self) -> Dict:
        """Return a per-chain summary: healthy, failures, time_since_success."""
        now = time.time()
        return {
            chain: {
                'healthy': info['healthy'],
                'failures': info['failures'],
                'time_since_success': now - info['last_success']
            }
            for chain, info in self.status.items()
        }

# ============================================
# Resilient Bridge Adapter
# ============================================

class ResilientBridge:
    """Bridge to Solana and Ethereum that keeps working when either is down.

    Solana reads fall back to the local StateCache and writes are always
    cached first. Ethereum commitments that cannot be submitted are parked in
    the CommitmentQueue and retried after a later successful submission or an
    explicit ``force_resync()``.
    """

    def __init__(self, solana_adapter, ethereum_adapter):
        """Wire up the chain adapters and the local resilience helpers.

        Args:
            solana_adapter: object providing ``read_state()`` and
                ``write_state(state)``.
            ethereum_adapter: object providing ``submit_commitment(commitment)``.
        """
        self.solana = solana_adapter
        self.ethereum = ethereum_adapter

        self.cache = StateCache()
        self.commitment_queue = CommitmentQueue()
        self.health = ChainHealth()
        self.vm = HDGLVM()

        # Failure mode state
        self.offline_mode = False
        self.last_solana_sync = time.time()
        self.last_ethereum_sync = time.time()

    def read_state_resilient(self) -> Optional[HDGLState]:
        """Return the current state: Solana first, then cache, then a fresh state."""
        state, _from_solana = self._read_state_with_source()
        return state

    def _read_state_with_source(self) -> Tuple[HDGLState, bool]:
        """Read the state and report whether Solana actually served it.

        Returns:
            ``(state, from_solana)``; ``from_solana`` is False when the state
            came from the local cache or was freshly initialised.
        """
        # Try Solana first (source of truth)
        if self.health.is_healthy('solana') or self.health.should_retry('solana'):
            try:
                state = self.solana.read_state()
                if state:
                    self.health.record_success('solana')

                    # Checkpoint every successful read so it can serve as the
                    # fallback during a later outage.
                    checkpoint = StateCheckpoint(
                        state=state,
                        commitment=state.commitment_hash(),
                        timestamp=time.time(),
                        evolution_count=state.memory.get('evolution_count', 0),
                        source='solana'
                    )
                    self.cache.save_checkpoint(checkpoint)

                    self.last_solana_sync = time.time()
                    self.offline_mode = False

                    return state, True
            except Exception as e:
                logger.warning(f"Solana read failed: {e}")
                self.health.record_failure('solana')

        # Fallback to cache
        logger.info("Using cached state (Solana unavailable)")
        checkpoint = self.cache.load_checkpoint()

        if checkpoint:
            self.offline_mode = True
            return checkpoint.state, False

        # No cache available - initialize fresh
        logger.warning("No cached state, initializing fresh")
        return HDGLState(), False

    def write_state_resilient(self, state: HDGLState) -> bool:
        """Cache ``state`` locally, then try to write it to Solana.

        Returns:
            True if Solana accepted the write, False if the state is only
            cached locally.
        """
        # Always save to cache first so the state survives a failed write.
        checkpoint = StateCheckpoint(
            state=state,
            commitment=state.commitment_hash(),
            timestamp=time.time(),
            evolution_count=state.memory.get('evolution_count', 0),
            source='local'
        )
        self.cache.save_checkpoint(checkpoint)

        # Try Solana write
        if self.health.is_healthy('solana') or self.health.should_retry('solana'):
            try:
                success = self.solana.write_state(state)
                if success:
                    self.health.record_success('solana')
                    self.last_solana_sync = time.time()
                    logger.info("✓ State synced to Solana")
                    return True
            except Exception as e:
                logger.warning(f"Solana write failed: {e}")
                self.health.record_failure('solana')

        logger.info("⚠ State cached locally (Solana unavailable)")
        return False

    def submit_commitment_resilient(self, commitment: bytes, evolution_count: int) -> bool:
        """Submit a commitment to Ethereum, queueing it when that is not possible.

        Returns:
            True if the commitment was submitted now, False if it was queued.
        """
        # Try Ethereum first
        if self.health.is_healthy('ethereum') or self.health.should_retry('ethereum'):
            try:
                tx_hash = self.ethereum.submit_commitment(commitment)
                if tx_hash:
                    self.health.record_success('ethereum')
                    self.last_ethereum_sync = time.time()
                    logger.info(f"✓ Commitment submitted: {commitment.hex()[:16]}...")

                    # Ethereum is reachable: use the opportunity to flush
                    # part of the backlog.
                    self._process_commitment_queue()
                    return True
            except Exception as e:
                logger.warning(f"Ethereum submission failed: {e}")
                self.health.record_failure('ethereum')

        # Queue for later
        self.commitment_queue.enqueue(commitment, evolution_count)
        logger.info(f"⚠ Commitment queued (Ethereum unavailable)")
        return False

    def _process_commitment_queue(self) -> int:
        """Submit one batch (up to 5) of queued commitments.

        Processing stops at the first failure. The failed commitment and every
        later one of the batch are put back in the queue, because the whole
        batch was already removed from disk by ``dequeue_batch``. A falsy
        transaction hash counts as a failure, as in
        ``submit_commitment_resilient``.

        Returns:
            Number of commitments submitted successfully.
        """
        queue_size = self.commitment_queue.size()
        if queue_size == 0:
            return 0

        logger.info(f"Processing {queue_size} queued commitments...")
        batch = self.commitment_queue.dequeue_batch(max_count=5)

        submitted = 0
        for index, (commitment, evolution_count) in enumerate(batch):
            failure = None
            try:
                tx_hash = self.ethereum.submit_commitment(commitment)
                if not tx_hash:
                    failure = "no transaction hash returned"
            except Exception as e:
                failure = e

            if failure is not None:
                logger.warning(f"  ✗ Failed to submit: {failure}")
                self.health.record_failure('ethereum')
                # Re-queue this item and the untried rest of the batch.
                for pending_commitment, pending_count in batch[index:]:
                    self.commitment_queue.enqueue(pending_commitment, pending_count)
                break

            self.health.record_success('ethereum')
            self.last_ethereum_sync = time.time()
            submitted += 1
            logger.info(f"  ✓ Submitted queued commitment #{evolution_count}")

        return submitted

    def get_resilience_status(self) -> Dict:
        """Return a snapshot of offline flag, chain health, cache and queue sizes."""
        return {
            'offline_mode': self.offline_mode,
            'chain_health': self.health.get_status(),
            # Ask for the full retained history; the default would cap the
            # reported count at 10.
            'cached_checkpoints': len(self.cache.get_history(self.cache.max_history)),
            'queued_commitments': self.commitment_queue.size(),
            'time_since_solana_sync': time.time() - self.last_solana_sync,
            'time_since_ethereum_sync': time.time() - self.last_ethereum_sync
        }

    def force_resync(self):
        """Retry both chains immediately, ignoring any pending backoff."""
        logger.info("Forcing full resync...")

        # Reset the failure counter AND the healthy flag. Clearing only the
        # counter leaves a chain marked unhealthy, which would skip the
        # Ethereum queue below.
        for chain in ('solana', 'ethereum'):
            self.health.status[chain]['failures'] = 0
            self.health.status[chain]['healthy'] = True

        # Try to read from Solana; report success only if Solana served it.
        _state, from_solana = self._read_state_with_source()
        if from_solana:
            logger.info("✓ Solana resync successful")
        else:
            logger.warning("⚠ Solana resync failed, continuing with local state")

        # Try to process one batch of the queue.
        if self.commitment_queue.size() == 0:
            return
        if self._process_commitment_queue() > 0:
            logger.info("✓ Ethereum queue processed")
        else:
            logger.warning("⚠ Ethereum queue not processed, commitments remain queued")

# ============================================
# Recovery Strategies
# ============================================

class RecoveryStrategies:
    """Recovery procedures, one per failure mode, operating on a ResilientBridge."""

    @staticmethod
    def solana_only_failure(bridge: ResilientBridge):
        """Handle a Solana-only outage: work from the cached checkpoint.

        The cached state's commitment is still submitted to Ethereum. Does
        nothing (beyond logging an error) when no valid checkpoint exists.
        """
        logger.info("Recovery: Solana down, using local evolution")

        # load_checkpoint() returns None when the cache is empty or invalid.
        checkpoint = bridge.cache.load_checkpoint()
        if not checkpoint:
            logger.error("No checkpoint available!")
            return
        state = checkpoint.state

        # Ethereum still works - submit the commitment with the evolution
        # count recorded in the checkpoint.
        commitment = state.commitment_hash()
        bridge.submit_commitment_resilient(commitment, checkpoint.evolution_count)

        logger.info("→ Operating in cached mode, will resync when Solana returns")

    @staticmethod
    def ethereum_only_failure(bridge: ResilientBridge):
        """Handle an Ethereum-only outage: queue the current commitment."""
        logger.info("Recovery: Ethereum down, queueing commitments")

        # Solana works - state is canonical
        state = bridge.read_state_resilient()

        # Queue the commitment for when Ethereum returns, tagged with the
        # state's own evolution count (same lookup the bridge uses).
        commitment = state.commitment_hash()
        evolution_count = state.memory.get('evolution_count', 0)
        bridge.commitment_queue.enqueue(commitment, evolution_count)

        logger.info(f"→ Commitments queued: {bridge.commitment_queue.size()}")

    @staticmethod
    def both_chains_failure(bridge: ResilientBridge):
        """Handle a total outage: confirm a checkpoint exists for offline work."""
        logger.info("Recovery: All chains down, pure local mode")

        # Load from cache
        checkpoint = bridge.cache.load_checkpoint()
        if not checkpoint:
            logger.error("No checkpoint available!")
            return

        # Continue evolving locally
        logger.info("→ Operating offline, will full-resync when network returns")

    @staticmethod
    def recovery_after_failure(bridge: ResilientBridge):
        """Resynchronise after an outage; Solana wins if the states diverged."""
        logger.info("Recovery: Chains back online, synchronizing...")

        # Force resync
        bridge.force_resync()

        # Verify state consistency. The direct adapter call can still fail if
        # Solana is not really back, so it must not abort the recovery.
        try:
            solana_state = bridge.solana.read_state()
        except Exception as e:
            logger.warning(f"Solana read failed: {e}")
            return
        cached_checkpoint = bridge.cache.load_checkpoint()

        if solana_state and cached_checkpoint:
            # Compare commitments
            solana_commit = solana_state.commitment_hash()
            cached_commit = cached_checkpoint.commitment

            if solana_commit == cached_commit:
                logger.info("✓ State is consistent")
            else:
                logger.warning("⚠ State diverged, using Solana as source of truth")
                # Solana is always canonical: overwrite the cached checkpoint.
                bridge.cache.save_checkpoint(StateCheckpoint(
                    state=solana_state,
                    commitment=solana_commit,
                    timestamp=time.time(),
                    evolution_count=solana_state.memory.get('evolution_count', 0),
                    source='solana_recovery'
                ))

# Public API exposed by `from <this module> import *`.
# StateCheckpoint is not listed, so star-imports do not expose it.
__all__ = ['ResilientBridge', 'StateCache', 'CommitmentQueue', 
           'ChainHealth', 'RecoveryStrategies']
